{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>品名</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>ZARA 新款 女装 Z1975 喇叭牛仔裤 07147031407</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>ZARA 新款 宽松迷笛白色连衣裙 01930303251</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>ZARA 春季 新款 女装 叠层装饰印花衬衫 08351227330</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ZARA 春季 新款 女装 开叉设计迷笛连衣裙 02636703601</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>ZARA 新款 人造珠宝纽扣针织外套 05802029712</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                    品名\n",
       "0   ZARA 新款 女装 Z1975 喇叭牛仔裤 07147031407\n",
       "1        ZARA 新款 宽松迷笛白色连衣裙 01930303251\n",
       "2   ZARA 春季 新款 女装 叠层装饰印花衬衫 08351227330\n",
       "3  ZARA 春季 新款 女装 开叉设计迷笛连衣裙 02636703601\n",
       "4       ZARA 新款 人造珠宝纽扣针织外套 05802029712"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the raw dataset from the 'data' sheet of the workbook\n",
    "df_raw = pd.read_excel(\n",
    "    io='./模糊匹配案例.xlsx',\n",
    "    sheet_name='data',\n",
    ")\n",
    "df_raw.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>关键词</th>\n",
       "      <th>类目</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>裙</td>\n",
       "      <td>裙类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>裤</td>\n",
       "      <td>裤类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>衬衫</td>\n",
       "      <td>衬衫</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>外套</td>\n",
       "      <td>外套</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>T恤</td>\n",
       "      <td>T恤</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  关键词  类目\n",
       "0   裙  裙类\n",
       "1   裤  裤类\n",
       "2  衬衫  衬衫\n",
       "3  外套  外套\n",
       "4  T恤  T恤"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the keyword -> category parameter table from the 'param' sheet\n",
    "df_param = pd.read_excel(\n",
    "    io='./模糊匹配案例.xlsx',\n",
    "    sheet_name='param',\n",
    ")\n",
    "df_param"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>品名</th>\n",
       "      <th>关键词</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>ZARA 新款 女装 Z1975 喇叭牛仔裤 07147031407</td>\n",
       "      <td>裤</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>ZARA 新款 宽松迷笛白色连衣裙 01930303251</td>\n",
       "      <td>裙</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>ZARA 春季 新款 女装 叠层装饰印花衬衫 08351227330</td>\n",
       "      <td>衬衫</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ZARA 春季 新款 女装 开叉设计迷笛连衣裙 02636703601</td>\n",
       "      <td>裙</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>ZARA 新款 人造珠宝纽扣针织外套 05802029712</td>\n",
       "      <td>外套</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                    品名 关键词\n",
       "0   ZARA 新款 女装 Z1975 喇叭牛仔裤 07147031407   裤\n",
       "1        ZARA 新款 宽松迷笛白色连衣裙 01930303251   裙\n",
       "2   ZARA 春季 新款 女装 叠层装饰印花衬衫 08351227330  衬衫\n",
       "3  ZARA 春季 新款 女装 开叉设计迷笛连衣裙 02636703601   裙\n",
       "4       ZARA 新款 人造珠宝纽扣针织外套 05802029712  外套"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fuzzy text matching\n",
    "def kw(df, keywords=None):\n",
    "    \"\"\"Return every keyword that occurs as a literal substring of one text value.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : str\n",
    "        A single text value (one entry of the '品名' column when used with\n",
    "        ``Series.apply``). Despite its name this is not a DataFrame.\n",
    "    keywords : iterable, optional\n",
    "        Candidate keywords. When omitted, the unique non-null values of\n",
    "        ``df_param['关键词']`` are used.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list\n",
    "        The candidates found in ``df``, in candidate order. Empty when\n",
    "        nothing matches or when ``df`` is not a string (e.g. NaN coming\n",
    "        from an empty Excel cell).\n",
    "    \"\"\"\n",
    "    # Missing / non-text cells cannot contain a keyword.\n",
    "    if not isinstance(df, str):\n",
    "        return []\n",
    "\n",
    "    if keywords is None:\n",
    "        # dropna(): a blank keyword cell must not take part in matching.\n",
    "        keywords = df_param['关键词'].dropna().unique()\n",
    "\n",
    "    # Literal substring test instead of a regex built from the keyword:\n",
    "    # keywords containing regex metacharacters ('+', '(', '.', ...) are\n",
    "    # matched verbatim, and a keyword located after a line break is found.\n",
    "    return [keyword for keyword in keywords if str(keyword) in df]\n",
    "\n",
    "# Keep one row per original index label before matching, so that\n",
    "# re-running this cell does not multiply rows: explode() repeats the\n",
    "# index label of a row for each of its keywords, and df_raw is rebound\n",
    "# to the exploded frame below.\n",
    "df_raw = (\n",
    "    df_raw.loc[~df_raw.index.duplicated()]\n",
    "    .assign(**{'关键词': lambda frame: frame['品名'].apply(kw)})\n",
    "    # One output row per matched keyword; names without a match get NaN.\n",
    "    .explode('关键词')\n",
    ")\n",
    "df_raw.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>品名</th>\n",
       "      <th>类目</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>ZARA 新款 女装 Z1975 喇叭牛仔裤 07147031407</td>\n",
       "      <td>裤类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>ZARA 新款 宽松迷笛白色连衣裙 01930303251</td>\n",
       "      <td>裙类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>ZARA 春季 新款 女装 叠层装饰印花衬衫 08351227330</td>\n",
       "      <td>衬衫</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ZARA 春季 新款 女装 开叉设计迷笛连衣裙 02636703601</td>\n",
       "      <td>裙类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>ZARA 新款 人造珠宝纽扣针织外套 05802029712</td>\n",
       "      <td>外套</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                    品名  类目\n",
       "0   ZARA 新款 女装 Z1975 喇叭牛仔裤 07147031407  裤类\n",
       "1        ZARA 新款 宽松迷笛白色连衣裙 01930303251  裙类\n",
       "2   ZARA 春季 新款 女装 叠层装饰印花衬衫 08351227330  衬衫\n",
       "3  ZARA 春季 新款 女装 开叉设计迷笛连衣裙 02636703601  裙类\n",
       "4       ZARA 新款 人造珠宝纽扣针织外套 05802029712  外套"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Look up the category of each matched keyword, then discard the helper\n",
    "# keyword column so only the name and its category remain\n",
    "df = (\n",
    "    df_raw\n",
    "    .merge(df_param, how='left', on='关键词')\n",
    "    .drop(columns=['关键词'])\n",
    ")\n",
    "df.head()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
